/*
 * This file is part of CoAnSys project.
 * Copyright (c) 2012-2015 ICM-UW
 *
 * CoAnSys is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CoAnSys is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
 */
package pl.edu.icm.coansys.commons.pig.udf;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.EnumMap;
import java.util.Locale;
import java.util.Map;

import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;

import com.google.protobuf.AbstractMessage.Builder;
import com.google.protobuf.Descriptors.Descriptor;
import com.google.protobuf.Descriptors.EnumDescriptor;
import com.google.protobuf.Descriptors.EnumValueDescriptor;
import com.google.protobuf.Descriptors.FieldDescriptor;
import com.google.protobuf.Descriptors.FieldDescriptor.Type;
import com.google.protobuf.DynamicMessage;
import com.google.protobuf.Message;

/**
 * Pig UDF converting pig tuple to protocol buffers message.
 *
 * <p>The input tuple must have exactly one position per field declared in the
 * protobuf descriptor; the value for a field is read from the tuple position
 * given by {@code FieldDescriptor.getIndex()}. Repeated fields are read from a
 * {@link DataBag}, nested messages from a nested {@link Tuple}, and enum values
 * from a {@link String} holding the enum constant name.
 *
 * @author Artur Czeczko <a.czeczko@icm.edu.pl>
 */
public class TupleToProtoBytearray extends EvalFunc<DataByteArray> {

    /**
     * A map between protobuf types and java class used in pig data. Protobuf
     * types absent from this map are reported as unsupported.
     */
    private static final Map<Type, Class<?>> PROTOBUF_TO_JAVA_TYPES =
            new EnumMap<Type, Class<?>>(Type.class);

    static {
        PROTOBUF_TO_JAVA_TYPES.put(Type.STRING, String.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.INT32, Integer.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.SINT32, Integer.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.UINT32, Integer.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.INT64, Long.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.SINT64, Long.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.UINT64, Long.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.FLOAT, Float.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.DOUBLE, Double.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.BOOL, Boolean.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.ENUM, String.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.MESSAGE, Tuple.class);
        PROTOBUF_TO_JAVA_TYPES.put(Type.BYTES, DataByteArray.class);
    }

    /** The protobuf message class whose static {@code newBuilder()} creates the root builder. */
    private final Class<? extends Message> protobufClass;

    /** Lazily resolved static {@code newBuilder()} method of {@link #protobufClass}. */
    private Method newBuilderMethod;

    /**
     * This constructor cannot be called directly in pig latin scripts, but it
     * can be used in default constructor of a subclass.
     *
     * @param protobufClass a class of protocol buffers messages
     */
    public TupleToProtoBytearray(Class<? extends Message> protobufClass) {
        this.protobufClass = protobufClass;
    }

    /**
     * Constructor with a protobuf class name. It can be called directly from
     * pig latin scripts, i.e.:
     * <p>
     * define myUDF
     * pl.edu.icm.coansys.commons.pig.udf.TupleToProtoBytearray("protobufClassName");
     * <p>
     * FOREACH data GENERATE myUDF($0);
     *
     * @param protobufClassName fully qualified name of a protobuf message class
     * @throws ClassNotFoundException if the named class cannot be loaded
     * @throws ClassCastException if the named class does not implement {@link Message}
     */
    public TupleToProtoBytearray(String protobufClassName) throws ClassNotFoundException {
        // asSubclass verifies the type up front instead of relying on an unchecked cast.
        this(Class.forName(protobufClassName).asSubclass(Message.class));
    }

    /**
     * Returns a data schema to pig scripts (with a single BYTEARRAY field).
     *
     * @param input schema of the input data (not used)
     * @return a schema with one BYTEARRAY field named after the lower-cased
     *         protobuf class name
     */
    @Override
    public Schema outputSchema(Schema input) {
        String alias = protobufClass.getName().toLowerCase(Locale.ENGLISH);
        return new Schema(new FieldSchema(alias, DataType.BYTEARRAY));
    }

    /**
     * Converts data from tuple to serialized protocol buffers message.
     *
     * @param input a tuple with one position per field of the protobuf message
     * @return the serialized message
     * @throws ExecException if a builder cannot be created or the tuple content
     *         does not match the protobuf schema
     */
    @Override
    public DataByteArray exec(Tuple input) throws ExecException {
        Message message = recursiveConvert(input, createBuilder());
        return new DataByteArray(message.toByteArray());
    }

    /**
     * Creates a fresh builder by invoking the static {@code newBuilder()} of the
     * configured protobuf class. The reflective method lookup is done once and
     * reused for subsequent calls.
     */
    private Builder<?> createBuilder() throws ExecException {
        try {
            if (newBuilderMethod == null) {
                newBuilderMethod = protobufClass.getMethod("newBuilder");
            }
            return (Builder<?>) newBuilderMethod.invoke(null);
        } catch (NoSuchMethodException ex) {
            throw new ExecException(ex);
        } catch (SecurityException ex) {
            throw new ExecException(ex);
        } catch (IllegalArgumentException ex) {
            throw new ExecException(ex);
        } catch (InvocationTargetException ex) {
            throw new ExecException(ex);
        } catch (IllegalAccessException ex) {
            throw new ExecException(ex);
        }
    }

    /**
     * Fills the builder with the values of the tuple and builds the message.
     * Nested message fields are converted recursively.
     *
     * @param input tuple holding one value per field of the builder's descriptor
     * @param builder builder of the message to create
     * @return the built message
     * @throws ExecException if the tuple does not match the protobuf schema
     */
    private Message recursiveConvert(Tuple input, Builder<?> builder) throws ExecException {
        Descriptor descr = builder.getDescriptorForType();
        if (descr.getFields().size() != input.size()) {
            throw new ExecException("Input tuple size doesn't match protobuf schema size");
        }

        for (FieldDescriptor protobufField : descr.getFields()) {
            Object tupleField = input.get(protobufField.getIndex());

            if (tupleField == null) {
                if (protobufField.isRequired()) {
                    throw new ExecException("There is no data for required field "
                            + protobufField.getName());
                }
                // Optional and repeated fields without data are simply left unset.
                continue;
            }

            if (protobufField.isRepeated()) {
                addRepeatedValues(builder, protobufField, tupleField);
            } else {
                builder.setField(protobufField, convertValue(protobufField, tupleField));
            }
        }
        return builder.build();
    }

    /**
     * Adds every element of a bag to a repeated field. For message fields each
     * tuple of the bag is the message itself; for all other types the value is
     * the first field of each tuple.
     */
    private void addRepeatedValues(Builder<?> builder, FieldDescriptor protobufField,
            Object tupleField) throws ExecException {
        if (!(tupleField instanceof DataBag)) {
            throw new ExecException("Data for repeated field must be in a DataBag, instead of "
                    + tupleField.getClass().getName());
        }
        boolean isMessage = protobufField.getType() == Type.MESSAGE;
        for (Tuple element : (DataBag) tupleField) {
            Object value;
            if (isMessage) {
                value = element;
            } else if (element == null || element.size() == 0) {
                // Nothing to read; convertValue reports the missing value.
                value = null;
            } else {
                value = element.get(0);
            }
            builder.addRepeatedField(protobufField, convertValue(protobufField, value));
        }
    }

    /**
     * Validates a single pig value against the protobuf field type and converts
     * it to the object expected by the protobuf builder.
     *
     * @param protobufField descriptor of the target field
     * @param value pig value for the field (or for one element of a repeated field)
     * @return the value to pass to {@code setField} / {@code addRepeatedField}
     * @throws ExecException if the protobuf type is not supported, the value is
     *         null or of an incompatible class, or an enum name is unknown
     */
    private Object convertValue(FieldDescriptor protobufField, Object value)
            throws ExecException {
        Type protobufType = protobufField.getType();
        Class<?> expectedClass = PROTOBUF_TO_JAVA_TYPES.get(protobufType);
        if (expectedClass == null) {
            throw new ExecException("Type not supported: " + protobufType);
        }
        if (value == null) {
            throw new ExecException("Null value is not allowed in field "
                    + protobufField.getName());
        }
        if (!expectedClass.isAssignableFrom(value.getClass())) {
            throw new ExecException("Data type not compatible: " + protobufType + ", "
                    + value.getClass().getName());
        }

        if (protobufType == Type.MESSAGE) {
            Builder<?> subbuilder = DynamicMessage.newBuilder(protobufField.getMessageType());
            return recursiveConvert((Tuple) value, subbuilder);
        }
        if (protobufType == Type.ENUM) {
            EnumDescriptor enumDescr = protobufField.getEnumType();
            EnumValueDescriptor enumValueDescr = enumDescr.findValueByName((String) value);
            if (enumValueDescr == null) {
                // findValueByName yields null for names the enum does not declare.
                throw new ExecException("Unknown value '" + value + "' for enum field "
                        + protobufField.getName());
            }
            return enumValueDescr;
        }
        return value;
    }
}